In [ ]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from wordcloud import WordCloud
import string
In [ ]:
import os
import random
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from typing import Union, List
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [ ]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Out[ ]:
True
In [ ]:
nltk.download('wordnet')
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Out[ ]:
True
In [ ]:
tweet_data = pd.read_csv('cyberbullying_tweets.csv')
In [ ]:
tweet_data.head()
Out[ ]:
tweet_text cyberbullying_type
0 In other words #katandandre, your food was cra... not_cyberbullying
1 Why is #aussietv so white? #MKR #theblock #ImA... not_cyberbullying
2 @XochitlSuckkks a classy whore? Or more red ve... not_cyberbullying
3 @Jason_Gio meh. :P thanks for the heads up, b... not_cyberbullying
4 @RudhoeEnglish This is an ISIS account pretend... not_cyberbullying
In [ ]:
tweet_data['cyberbullying_type'].value_counts()
Out[ ]:
cyberbullying_type
religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: count, dtype: int64
In [ ]:
# Compiled once at module load: the original recompiled this pattern on every
# call, and it is applied to every one of ~48k tweets.
_EMOJI_PATTERN = re.compile(
    '[\U0001F600-\U0001F64F'   # emoticons
    '\U0001F300-\U0001F5FF'    # misc symbols & pictographs
    '\U0001F680-\U0001F6FF'    # transport & map symbols
    '\U0001F1E0-\U0001F1FF]'   # regional-indicator (flag) letters
)

def extract_emojis(text):
    """Return the emoji characters found in ``text``, in order of appearance.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    list[str]
        One entry per matched emoji character; empty list when none match.

    Note: only the four Unicode blocks above are covered — newer emoji
    (e.g. U+1F900–U+1F9FF supplemental symbols) are not matched, same as
    the original implementation.
    """
    return _EMOJI_PATTERN.findall(text)
In [ ]:
tweet_data['emojis'] = tweet_data['tweet_text'].apply(extract_emojis)
In [ ]:
tweet_data['emojis'][21]
Out[ ]:
['😘']
In [ ]:
# Built once at module load instead of per call: the original re-read the
# stopword corpus (stopwords.words) and constructed a new WordNetLemmatizer
# for every single tweet (~48k calls).
_STOP_WORDS = set(stopwords.words('english'))
_PUNCTUATION = set(string.punctuation)
_LEMMATIZER = WordNetLemmatizer()

def preprocess_text(text):
    """Clean one tweet and return it as a single space-joined string.

    Steps (unchanged from the original):
      1. strip URLs,
      2. replace everything except letters, '#' and '@' with spaces,
      3. lowercase and tokenize,
      4. drop English stopwords and bare punctuation tokens
         (tokens that are exactly '#' or '@' survive step 2 and are
         removed here),
      5. lemmatize with a coarse POS mapping (verb/adjective, else noun).

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Space-joined lemmatized tokens.
    """
    # Remove URLs before stripping punctuation so the scheme doesn't leak
    # in as stray tokens.
    text = re.sub(r'http\S+', '', text)
    text = re.sub("[^a-zA-Z#@]", " ", text)
    text = text.lower()
    words = word_tokenize(text)

    # Single filtering pass; order of the two original filters did not matter.
    words = [w for w in words if w not in _STOP_WORDS and w not in _PUNCTUATION]

    # Map Penn Treebank tags onto WordNet POS; everything unrecognized
    # (including adverbs, as in the original) defaults to noun.
    lemmatized_words = []
    for word, tag in pos_tag(words):
        if tag.startswith('VB'):
            pos = 'v'
        elif tag.startswith('JJ'):
            pos = 'a'
        else:
            pos = 'n'
        lemmatized_words.append(_LEMMATIZER.lemmatize(word, pos))
    return ' '.join(lemmatized_words)
In [ ]:
#!pip install --upgrade nltk
In [ ]:
nltk.download('wordnet')
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Out[ ]:
True
In [ ]:
tweet_data['clean_text']= tweet_data['tweet_text'].apply(preprocess_text)
In [ ]:
tweet_data
Out[ ]:
tweet_text cyberbullying_type emojis clean_text
0 In other words #katandandre, your food was cra... not_cyberbullying [] word katandandre food crapilicious mkr
1 Why is #aussietv so white? #MKR #theblock #ImA... not_cyberbullying [] aussietv white mkr theblock imacelebrityau tod...
2 @XochitlSuckkks a classy whore? Or more red ve... not_cyberbullying [] xochitlsuckkks classy whore red velvet cupcake
3 @Jason_Gio meh. :P thanks for the heads up, b... not_cyberbullying [] jason gio meh p thanks head concern another an...
4 @RudhoeEnglish This is an ISIS account pretend... not_cyberbullying [] rudhoeenglish isi account pretend kurdish acco...
... ... ... ... ...
47687 Black ppl aren't expected to do anything, depe... ethnicity [] black ppl expect anything depend anything yet ...
47688 Turner did not withhold his disappointment. Tu... ethnicity [] turner withhold disappointment turner call cou...
47689 I swear to God. This dumb nigger bitch. I have... ethnicity [] swear god dumb nigger bitch get bleach hair re...
47690 Yea fuck you RT @therealexel: IF YOURE A NIGGE... ethnicity [] yea fuck rt therealexel youre nigger fuck unfo...
47691 Bro. U gotta chill RT @CHILLShrammy: Dog FUCK ... ethnicity [] bro u get ta chill rt chillshrammy dog fuck kp...

47692 rows × 4 columns

In [ ]:
# import emoji
In [ ]:
#!pip install demoji
Collecting demoji
  Downloading demoji-1.1.0-py3-none-any.whl.metadata (9.2 kB)
Downloading demoji-1.1.0-py3-none-any.whl (42 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.9/42.9 kB 622.2 kB/s eta 0:00:000:00:01
Installing collected packages: demoji
Successfully installed demoji-1.1.0
In [ ]:
import demoji

demoji.download_codes()
C:\Users\hp\AppData\Local\Temp\ipykernel_6064\1306366529.py:3: FutureWarning: The demoji.download_codes attribute is deprecated and will be removed from demoji in a future version. It is an unused attribute as emoji codes are now distributed directly with the demoji package.
  demoji.download_codes()
In [ ]:
tweet_data['emojis_description'] = tweet_data['emojis'].apply(lambda x: ' '.join(demoji.findall(' '.join(x)).values()) if x else '')

print(tweet_data['emojis_description'])
0         
1         
2         
3         
4         
        ..
47687     
47688     
47689     
47690     
47691     
Name: emojis_description, Length: 47692, dtype: object
In [ ]:
tweet_data['emojis_description']= tweet_data['emojis_description'].apply(preprocess_text)
In [ ]:
tweet_data['combined_text'] = tweet_data['clean_text'] + ' ' + tweet_data['emojis_description']
In [ ]:
 
In [ ]:
tweet_data['combined_text'][21]
Out[ ]:
'kid love mohamad bin zayed city face blow kiss'
In [ ]:
from transformers import BertTokenizer, BertModel
import torch
In [ ]:
def _show_wordcloud(text, title):
    """Generate and display one word cloud (same figure settings as before)."""
    wc = WordCloud(width=800, height=800, max_words=1000,
                   background_color='white').generate(text)
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wc)
    plt.axis("off")
    plt.title(title)
    plt.tight_layout(pad=0)
    plt.show()

# One word cloud per class — replaces six copy-pasted generate/plot sections.
# Titles are kept byte-identical to the originals.
for class_name, title in [
    ('religion', 'Word Cloud for Religion'),
    ('age', 'Word Cloud for Age'),
    ('gender', 'Word Cloud for Gender'),
    ('ethnicity', 'Word Cloud for Ethnicity'),
    ('not_cyberbullying', 'Word Cloud for Not Cyberbullying'),
    ('other_cyberbullying', 'Word Cloud for Other_cyberbullying'),
]:
    class_text = ' '.join(
        tweet_data.loc[tweet_data['cyberbullying_type'] == class_name, 'tweet_text'].values
    )
    _show_wordcloud(class_text, title)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
print(f'There are around {int(tweet_data["combined_text"].duplicated().sum())} duplicated tweets, we will remove them.')
There are around 2367 duplicated tweets, we will remove them.
In [ ]:
tweet_data.drop_duplicates("combined_text", inplace=True)
In [ ]:
tweet_data['cyberbullying_type'].value_counts()
Out[ ]:
cyberbullying_type
religion               7955
age                    7886
ethnicity              7865
not_cyberbullying      7826
gender                 7704
other_cyberbullying    6089
Name: count, dtype: int64
In [ ]:
tweet_data['text_len'] = [len(text.split()) for text in tweet_data.tweet_text]
In [ ]:
tweet_data.sort_values(by=['text_len'], ascending=False)
Out[ ]:
tweet_text cyberbullying_type emojis clean_text emojis_description combined_text text_len
29205 is feminazi an actual word with a denot…\r\n@N... other_cyberbullying [🙉] feminazi actual word denot nasharchy job mean ... hear evil monkey feminazi actual word denot nasharchy job mean ... 790
24516 @NICKIMINAJ: #WutKinda\r\nAt this rate the MKR... other_cyberbullying [] nickiminaj wutkinda rate mkr final december mk... nickiminaj wutkinda rate mkr final december mk... 692
30752 I don't retreat.\r\nyessssssss http://t.co/Td9... other_cyberbullying [] retreat yes uh make grownups boruto look lit b... retreat yes uh make grownups boruto look lit b... 505
44035 You so black and white trying to live like a n... ethnicity [] black white try live like nigger pahahahaha co... black white try live like nigger pahahahaha co... 329
1317 @EurekAlertAAAS: Researchers push to import to... not_cyberbullying [💩, 👍, 💔, 👎] eurekalertaaas researcher push import top anti... pile poo thumb broken heart thumb eurekalertaaas researcher push import top anti... 289
... ... ... ... ... ... ... ...
334 FFS not_cyberbullying [] ffs ffs 1
8457 #IndiasDaughter gender [] indiasdaughter indiasdaughter 1
8165 good! gender [] good good 1
307 :D not_cyberbullying [] 1
314 Preseason not_cyberbullying [] preseason preseason 1

45325 rows × 7 columns

In [ ]:
plt.figure(figsize=(20,5))
ax = sns.countplot(x='text_len', data=tweet_data[tweet_data['text_len']<=1000], palette='Blues_r')
plt.title('Count of tweets vs length ', fontsize=20)
plt.yticks([])
ax.bar_label(ax.containers[0])
plt.ylabel('count')
plt.xlabel('')
plt.show()
In [ ]:
tweet_data = tweet_data[tweet_data['text_len'] < tweet_data['text_len'].quantile(0.995)]
In [ ]:
max_len = np.max(tweet_data['text_len'])
max_len
Out[ ]:
56
In [ ]:
MAX_LEN =100
In [ ]:
from keras.preprocessing.sequence import pad_sequences
2024-04-18 10:33:06.328212: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-18 10:33:06.328346: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-18 10:33:06.499978: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
In [ ]:
label_encoder = LabelEncoder()
tweet_data['cyberbullying_type_label'] = label_encoder.fit_transform(tweet_data['cyberbullying_type'])
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))
{'age': 0, 'ethnicity': 1, 'gender': 2, 'not_cyberbullying': 3, 'other_cyberbullying': 4, 'religion': 5}
/tmp/ipykernel_76/246362075.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweet_data['cyberbullying_type_label'] = label_encoder.fit_transform(tweet_data['cyberbullying_type'])
In [ ]:
X = tweet_data['combined_text'].values
y = tweet_data['cyberbullying_type_label'].values
In [ ]:
class TextClassificationDataset(Dataset):
    """Torch ``Dataset`` pairing texts with integer labels for BERT fine-tuning.

    Tokenization happens lazily per item: each text is padded/truncated to
    ``max_length`` and returned as flat 1-D tensors alongside its label.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(self.labels[idx])
        return item
In [ ]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear head producing class logits."""

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        # Classify from the [CLS] pooled representation, as in the original.
        encoder_out = self.bert(input_ids, attention_mask=attention_mask)
        pooled = self.dropout(encoder_out.pooler_output)
        return self.fc(pooled)
In [ ]:
from tqdm import tqdm

def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch and return the mean batch loss.

    Parameters
    ----------
    model : nn.Module
        Classifier called as ``model(input_ids=..., attention_mask=...)``.
    data_loader : iterable
        Yields dicts with 'input_ids', 'attention_mask' and 'label' tensors.
    optimizer, scheduler :
        Stepped once per batch (scheduler after optimizer, matching
        get_linear_schedule_with_warmup usage).
    device :
        Target device for the batch tensors.

    Returns
    -------
    float
        Total loss divided by number of batches.
    """
    model.train()
    # Hoisted: the original constructed a new CrossEntropyLoss module on
    # every batch for no benefit.
    criterion = nn.CrossEntropyLoss()
    total_loss = 0.0
    with tqdm(total=len(data_loader), desc="Training") as progress_bar:
        for batch in data_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
            progress_bar.update(1)
    return total_loss / len(data_loader)
In [ ]:
def evaluate(model, data_loader, device):
    """Score the model over ``data_loader`` without gradient tracking.

    Returns a tuple of (accuracy, sklearn classification report string)
    computed from argmax predictions versus the batch labels.
    """
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in data_loader:
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            all_preds.extend(logits.argmax(dim=1).cpu().tolist())
            all_labels.extend(batch['label'].to(device).cpu().tolist())
    return (
        accuracy_score(y_true=all_labels, y_pred=all_preds),
        classification_report(y_true=all_labels, y_pred=all_preds),
    )
In [ ]:
def predict_class(text, model, tokenizer, classes, max_length=128):
    """Predict the class name for a single piece of text.

    Tokenizes ``text`` (padded/truncated to ``max_length``), runs the model
    without gradients, and maps the argmax logit index into ``classes``.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    with torch.no_grad():
        logits = model(
            input_ids=encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
        )
        predicted_idx = logits.argmax(dim=1).item()
    return classes[predicted_idx]
In [ ]:
bert_model_name = 'bert-base-uncased'
num_classes = 6
max_length = 128
batch_size = 32
num_epochs = 5
learning_rate = 2e-5
In [ ]:
train_texts, val_texts, train_labels, val_labels = train_test_split(X, y, test_size=0.2, random_state=42)
len(train_texts), len(val_texts), len(train_labels), len(val_labels)
Out[ ]:
(36014, 9004, 36014, 9004)